{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Logistic Regression Code Along"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "This is a code along of the famous titanic dataset, its always nice to start off with this dataset because it is an example you will find across pretty much every data analysis language."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Note that usually all imports would occur at the top and\n",
    "# most of this would be in an object this layout if for learning purposes only\n",
    "\n",
    "# Logistic Regression Example\n",
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder.appName('logreg').getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load training data\n",
    "data = spark.read.csv('titanic.csv',inferSchema=True,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- PassengerId: integer (nullable = true)\n",
      " |-- Survived: integer (nullable = true)\n",
      " |-- Pclass: integer (nullable = true)\n",
      " |-- Name: string (nullable = true)\n",
      " |-- Sex: string (nullable = true)\n",
      " |-- Age: double (nullable = true)\n",
      " |-- SibSp: integer (nullable = true)\n",
      " |-- Parch: integer (nullable = true)\n",
      " |-- Ticket: string (nullable = true)\n",
      " |-- Fare: double (nullable = true)\n",
      " |-- Cabin: string (nullable = true)\n",
      " |-- Embarked: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the Schema of the DataFrame\n",
    "data.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n",
      "|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|\n",
      "+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n",
      "|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|\n",
      "|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|\n",
      "|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|\n",
      "|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|\n",
      "|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|\n",
      "|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|          330877| 8.4583| null|       Q|\n",
      "|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|\n",
      "|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1|          349909| 21.075| null|       S|\n",
      "|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|          347742|11.1333| null|       S|\n",
      "|         10|       1|     2|Nasser, Mrs. Nich...|female|14.0|    1|    0|          237736|30.0708| null|       C|\n",
      "|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|         PP 9549|   16.7|   G6|       S|\n",
      "|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|          113783|  26.55| C103|       S|\n",
      "|         13|       0|     3|Saundercock, Mr. ...|  male|20.0|    0|    0|       A/5. 2151|   8.05| null|       S|\n",
      "|         14|       0|     3|Andersson, Mr. An...|  male|39.0|    1|    5|          347082| 31.275| null|       S|\n",
      "|         15|       0|     3|Vestrom, Miss. Hu...|female|14.0|    0|    0|          350406| 7.8542| null|       S|\n",
      "|         16|       1|     2|Hewlett, Mrs. (Ma...|female|55.0|    0|    0|          248706|   16.0| null|       S|\n",
      "|         17|       0|     3|Rice, Master. Eugene|  male| 2.0|    4|    1|          382652| 29.125| null|       Q|\n",
      "|         18|       1|     2|Williams, Mr. Cha...|  male|null|    0|    0|          244373|   13.0| null|       S|\n",
      "|         19|       0|     3|Vander Planke, Mr...|female|31.0|    1|    0|          345763|   18.0| null|       S|\n",
      "|         20|       1|     3|Masselmani, Mrs. ...|female|null|    0|    0|            2649|  7.225| null|       C|\n",
      "+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----------------+-------------------+------------------+------------------+------------------+-------------------+-----------------+\n",
      "|summary|      PassengerId|           Survived|            Pclass|               Age|             SibSp|              Parch|             Fare|\n",
      "+-------+-----------------+-------------------+------------------+------------------+------------------+-------------------+-----------------+\n",
      "|  count|              891|                891|               891|               714|               891|                891|              891|\n",
      "|   mean|            446.0| 0.3838383838383838| 2.308641975308642| 29.69911764705882|0.5230078563411896|0.38159371492704824| 32.2042079685746|\n",
      "| stddev|257.3538420152301|0.48659245426485753|0.8360712409770491|14.526497332334035|1.1027434322934315| 0.8060572211299488|49.69342859718089|\n",
      "|    min|                1|                  0|                 1|              0.42|                 0|                  0|              0.0|\n",
      "|    max|              891|                  1|                 3|              80.0|                 8|                  6|         512.3292|\n",
      "+-------+-----------------+-------------------+------------------+------------------+------------------+-------------------+-----------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Drop missing data\n",
    "clean_data = data.na.drop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.linalg import Vectors\n",
    "from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder,VectorIndexer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Working with Categorical Columns\n",
    "\n",
    "Let's break this down into multiple steps to make it all clear."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['PassengerId',\n",
       " 'Survived',\n",
       " 'Pclass',\n",
       " 'Name',\n",
       " 'Sex',\n",
       " 'Age',\n",
       " 'SibSp',\n",
       " 'Parch',\n",
       " 'Ticket',\n",
       " 'Fare',\n",
       " 'Cabin',\n",
       " 'Embarked']"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_data,test_data = clean_data.randomSplit([0.7,0.3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "gender_indexer = StringIndexer(inputCol=\"Sex\", outputCol=\"SexIndex\")\n",
    "gender_encoder = OneHotEncoder(inputCol=\"SexIndex\", outputCol=\"SexVec\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "embark_indexer = StringIndexer(inputCol=\"Embarked\", outputCol=\"EmbarkIndex\")\n",
    "embark_encoder = OneHotEncoder(inputCol=\"EmbarkIndex\", outputCol=\"EmbarkVec\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipelines \n",
    "\n",
    "Let's see an example of how to use pipelines (we'll get a lot more practice with these later!)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.linalg import Vectors\n",
    "from pyspark.ml.feature import VectorAssembler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "assembler = VectorAssembler(\n",
    "    inputCols=[\"Pclass\", \"SexVec\", \"Age\",\"SibSp\",\"Parch\",\"Fare\",\"EmbarkVec\"],\n",
    "    outputCol=\"features\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.classification import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "logreg = LogisticRegression(featuresCol='features',labelCol='Survived')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pyspark.ml import Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- PassengerId: integer (nullable = true)\n",
      " |-- Survived: integer (nullable = true)\n",
      " |-- Pclass: integer (nullable = true)\n",
      " |-- Name: string (nullable = true)\n",
      " |-- Sex: string (nullable = true)\n",
      " |-- Age: double (nullable = true)\n",
      " |-- SibSp: integer (nullable = true)\n",
      " |-- Parch: integer (nullable = true)\n",
      " |-- Ticket: string (nullable = true)\n",
      " |-- Fare: double (nullable = true)\n",
      " |-- Cabin: string (nullable = true)\n",
      " |-- Embarked: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_data.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pipeline = Pipeline(stages=[gender_indexer,embark_indexer,\n",
    "                            gender_encoder,embark_encoder,assembler,logreg])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "model = pipeline.fit(train_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "results = model.transform(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- PassengerId: integer (nullable = true)\n",
      " |-- Survived: integer (nullable = true)\n",
      " |-- Pclass: integer (nullable = true)\n",
      " |-- Name: string (nullable = true)\n",
      " |-- Sex: string (nullable = true)\n",
      " |-- Age: double (nullable = true)\n",
      " |-- SibSp: integer (nullable = true)\n",
      " |-- Parch: integer (nullable = true)\n",
      " |-- Ticket: string (nullable = true)\n",
      " |-- Fare: double (nullable = true)\n",
      " |-- Cabin: string (nullable = true)\n",
      " |-- Embarked: string (nullable = true)\n",
      " |-- SexIndex: double (nullable = true)\n",
      " |-- EmbarkIndex: double (nullable = true)\n",
      " |-- SexVec: vector (nullable = true)\n",
      " |-- EmbarkVec: vector (nullable = true)\n",
      " |-- features: vector (nullable = true)\n",
      " |-- rawPrediction: vector (nullable = true)\n",
      " |-- probability: vector (nullable = true)\n",
      " |-- prediction: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "results.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "AUC = evaluator.evaluate(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.735144312393888"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AUC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Survived',\n",
    "                                             metricName='accuracy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "acc = evaluator.evaluate(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.76"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "acc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Great, you are now ready for your consulting project!"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
